{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 1.采——网页的采集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "req = requests.get('https://wap.zol.com.cn/top/cell_phone/hot.html')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2.抽——信息的抽取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "result = re.findall(\n",
    "    '<p class=\"pro-info-name f28\">(.*?)<\\/p>[\\S\\s]*?<span class=\"pro-info-price f24\">(.*?)<\\/span>',\n",
    "    req.text\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.存——保存采集结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('mobile.txt', 'w') as f:\n",
    "    for item in result:\n",
    "        f.write(item[0] + ' ' + item[1] + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat mobile.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 基础爬虫类(框架)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "\n",
    "class MyCrawler:\n",
    "    def __init__(self, filename):\n",
    "        self.filename = filename\n",
    "    \n",
    "    def download(self, url):\n",
    "        r = requests.get(url)\n",
    "        return r.text\n",
    "    \n",
    "    def extract(self, content, pattern):\n",
    "        result = re.findall(pattern, content)\n",
    "        return result\n",
    "    \n",
    "    def save(self, info):\n",
    "        with open(self.filename, 'a', encoding='utf-8') as f:\n",
    "            for item in info:\n",
    "                f.write('|||'.join(item) + '\\n')\n",
    "    \n",
    "    def crawl(self, url, pattern):\n",
    "        content = self.download(url)\n",
    "        info = self.extract(content, pattern)\n",
    "        self.save(info)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 对zol.com.cn进行测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "crawler = MyCrawler('mobile.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "CONTENT = crawler.download('https://wap.zol.com.cn/top/cell_phone/hot.html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "crawler.crawl(\n",
    "    'https://wap.zol.com.cn/top/cell_phone/hot.html', \n",
    "    '<p class=\"pro-info-name f28\">(.*?)<\\/p>[\\S\\s]*?<span class=\"pro-info-price f24\">(.*?)<\\/span>'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat mobile.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 对bilibili进行测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_crawler = MyCrawler('bilibili.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = b_crawler.download('https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "info = b_crawler.extract(\n",
    "    c, \n",
    "    '<a\\shref=\"([^\"]*?)\"\\starget=\"_blank\"\\sclass=\"title\">(.*?)<\\/a>.*?<\\/i>(.*?)<\\/span>.*?<\\/i>(.*?)</span>.*?<\\/i>(.*?)</span>.*?<div>(\\d+)<\\/div>'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_crawler.save(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_crawler.crawl(\n",
    "    'https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3',\n",
    "    '<a\\shref=\"([^\"]*?)\"\\starget=\"_blank\"\\sclass=\"title\">(.*?)<\\/a>.*?<\\/i>(.*?)<\\/span>.*?<\\/i>(.*?)</span>.*?<\\/i>(.*?)</span>.*?<div>(\\d+)<\\/div>',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm bilibili.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('bilibili.txt','r',encoding='utf-8') as f:\n",
    "    lines = f.read()\n",
    "    for line in lines.split('\\n'):\n",
    "        print(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat bilibili.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_crawler.crawl(\n",
    "    'https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3',\n",
    "    '<a\\shref=\"([^\"]*?)\"\\starget=\"_blank\"\\sclass=\"title\">(.*?)<\\/a>.*?<\\/i>(.*?)<\\/span>.*?<\\/i>(.*?)</span>.*?<\\/i>(.*?)</span>.*?<div>(\\d+)<\\/div>'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 对豆瓣进行测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "b_crawler = MyCrawler('douban_book.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "''"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b_crawler.download('https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 用curl.trillworks.com实现Chrome网络请求的“克隆”"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "\n",
    "headers = {\n",
    "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',\n",
    "}\n",
    "\n",
    "response = requests.get('https://book.douban.com/tag/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C', headers=headers)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "47944"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(response.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'Neural Networks and Deep Learning' in response.text"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
